Load necessary packages.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.3.2

Read the dataset and drop NA values.

# Load the cleaned YouTube channel data, normalize column names to
# snake_case, and keep only complete rows (rows with any NA are dropped,
# which reduces the data from 995 to 589 rows — see summary below).
youtube_df <- read_csv("Data/cleaned_youtube_df.csv") %>% 
  janitor::clean_names() %>% 
  drop_na()
## Rows: 995 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): category, country, abbreviation, channel_type
## dbl (14): id, subscribers, video_views, uploads, video_views_for_the_last_30...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick distributional overview of every column after cleaning.
summary(youtube_df)
##        id         subscribers         video_views          category        
##  Min.   :  1.0   Min.   : 12300000   Min.   :2.634e+03   Length:589        
##  1st Qu.:213.0   1st Qu.: 14700000   1st Qu.:4.910e+09   Class :character  
##  Median :454.0   Median : 18600000   Median :8.761e+09   Mode  :character  
##  Mean   :471.2   Mean   : 24264516   Mean   :1.280e+10                     
##  3rd Qu.:723.0   3rd Qu.: 26400000   3rd Qu.:1.507e+10                     
##  Max.   :995.0   Max.   :245000000   Max.   :2.280e+11                     
##     uploads         country          abbreviation       channel_type      
##  Min.   :     1   Length:589         Length:589         Length:589        
##  1st Qu.:   433   Class :character   Class :character   Class :character  
##  Median :  1189   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 13945                                                           
##  3rd Qu.:  3882                                                           
##  Max.   :301308                                                           
##  video_views_for_the_last_30_days lowest_monthly_earnings
##  Min.   :3.000e+00                Min.   :     0         
##  1st Qu.:4.845e+07                1st Qu.: 11500         
##  Median :1.143e+08                Median : 28200         
##  Mean   :2.446e+08                Mean   : 54260         
##  3rd Qu.:2.441e+08                3rd Qu.: 59000         
##  Max.   :6.589e+09                Max.   :850900         
##  highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
##  Min.   :       0         Min.   :       0       Min.   :        0      
##  1st Qu.:  183200         1st Qu.:  137400       1st Qu.:  2200000      
##  Median :  451100         Median :  338300       Median :  5400000      
##  Mean   :  867499         Mean   :  650618       Mean   : 10417717      
##  3rd Qu.:  944000         3rd Qu.:  708000       3rd Qu.: 11300000      
##  Max.   :13600000         Max.   :10200000       Max.   :163400000      
##  subscribers_for_last_30_days  created_year    population       
##  Min.   :      1              Min.   :1970   Min.   :2.025e+05  
##  1st Qu.: 100000              1st Qu.:2010   1st Qu.:1.081e+08  
##  Median : 200000              Median :2013   Median :3.282e+08  
##  Mean   : 356695              Mean   :2013   Mean   :4.906e+08  
##  3rd Qu.: 400000              3rd Qu.:2016   3rd Qu.:3.282e+08  
##  Max.   :8000000              Max.   :2022   Max.   :1.398e+09  
##     latitude        longitude       
##  Min.   :-38.42   Min.   :-172.105  
##  1st Qu.: 20.59   1st Qu.: -95.713  
##  Median : 30.59   Median :  -3.436  
##  Mean   : 26.21   Mean   :  -6.254  
##  3rd Qu.: 37.09   3rd Qu.:  78.963  
##  Max.   : 61.52   Max.   : 138.253

We use histogram and density plot to intuitively visualize the distribution of different variables.

# Select the numeric measurement columns to visualize, excluding identifiers
# and geographic/temporal metadata that are not meaningful as histograms.
all_columns <- colnames(youtube_df)
columns_to_plot <- setdiff(
  all_columns,
  c("id", "category", "country", "abbreviation", "channel_type",
    "population", "latitude", "longitude", "created_year")
)

# Reshape to long format (one row per variable/value pair) for faceting.
# pivot_longer() replaces the superseded gather().
numeric_data_long <- youtube_df %>%
  select(all_of(columns_to_plot)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")


# Create a single plot with facets for each numeric variable
# Create a single plot with facets for each numeric variable.
# Each panel overlays a density curve on a density-scaled histogram.
p <- ggplot(numeric_data_long, aes(x = value)) +
  # after_stat(density) replaces the `..density..` notation,
  # which was deprecated in ggplot2 3.4.0.
  geom_histogram(aes(y = after_stat(density)),
                 bins = 15, fill = "#8dab7f", alpha = 0.8) +
  geom_density(color = "#6b8e23") +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal(base_size = 10) +  
  theme(
    strip.text.x = element_text(size = 10, face = "bold"), 
    axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1, size = 7,
                               face = "bold"), # Angle x-axis labels for readability
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold"),
    plot.margin = margin(1, 1, 1, 1, "cm"), # Adjust the plot margins
    strip.background = element_blank(),
    panel.spacing = unit(3, "lines")
  ) +
  labs(
    title = "Distribution of Numeric Variables",
    x = "Value",
    # The y aesthetic is density (not raw counts), so label it accordingly.
    y = "Density",
    caption = "Source: YouTube Data"
  )
# Convert to an interactive plot
ggplotly(p)

We plot heatmap to study the relationships among numerical variables.

# Calculate the correlation matrix over the numeric columns only,
# using pairwise-complete observations.
cor_matrix <- cor(youtube_df[, columns_to_plot], use = "complete.obs")

# width/height must be set in plot_ly() itself; passing them through
# layout() is deprecated and emits a warning.
fig <- plot_ly(
  x = colnames(cor_matrix), y = rownames(cor_matrix), z = cor_matrix,
  type = "heatmap", colorscale = "Greens", zmin = -1, zmax = 1,
  width = 800, height = 600
)

# Reverse the y axis so the matrix reads top-to-bottom like a table.
fig <- fig %>% layout(
  yaxis = list(autorange = "reversed"),
  title = "Correlation Matrix")
fig
# Count channels per category for the word cloud. The "nan" string filter
# guards against a literal "nan" placeholder left over from cleaning
# (actual NAs were already removed by drop_na() above).
# count() returns an ungrouped tibble, so no ungroup() is needed.
category_data <- youtube_df %>%
  filter(!is.na(category) & category != "nan") %>%
  count(category) %>%
  mutate(
    n = n * 30,                    # inflate counts so sizes spread visibly
    scaled_size = log(n + 1)       # log scale; +1 avoids log(0)
  )

# Word cloud of channel categories: word size reflects log-scaled count,
# color reflects the raw (inflated) count.
wordcloud_plot <-
  ggplot(category_data, aes(label = category, size = scaled_size)) +
  geom_text_wordcloud(aes(color = n),
                      shape = 'circle',
                      rm_outside = TRUE) +
  scale_size_area(max_size = 10) +
  scale_color_gradient(low = "#ffcc99", high = "#8dab7f") +
  theme_void(base_family = "sans") +
  theme(plot.margin = margin(1, 1, 1, 1, "cm"), # breathing room around cloud
        legend.position = "none")               # sizes/colors are decorative

# Display the plot
wordcloud_plot